#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Bootstrap: run the path setup script, then the code in P_Lib/'GasStation.py',
# both directly in this module's namespace (P_Lib is presumably defined by
# init_path.py -- confirm).
with open('init_path.py') as _init_src:  # close the handle deterministically
    exec(_init_src.read())
exec((P_Lib/'GasStation.py').read_text())

# `get_ipython` only exists inside an IPython/Jupyter session; when this file
# is run as a plain script the magic is skipped instead of raising NameError.
try:
    _ipython = get_ipython()
except NameError:
    _ipython = None
if _ipython is not None:
    _ipython.run_line_magic('matplotlib', 'inline')


# In[2]:


# [f.unlink() for f in (P_GS_Data_Raw / 'PH_FullDay').glob('*.h5')]; # Clear up daily folder


# In[3]:

# Fuel columns to process; each one gets its own output directory.
fuels = ['e5', 'e10', 'diesel']
# In[4]:

# Daily input files, ordered by the trailing integer of the file stem
# (the same token is later parsed as the day's date). The listing does not
# depend on the fuel, so it is built once instead of once per iteration.
files = sorted(
    (P_GS_Data_Raw / 'PH_Day').glob('*.h5'),
    key=lambda f: int(f.name.split('.')[0].split('_')[-1]),
)

# Assumed opening window and the tag used in the output directory name.
h_open = '7:00'; h_close = '21:00'; version = '7to21'

for fuel in fuels:

    # In[5]:

    # Output directory for this fuel: PH_FullDay_<version>_<fuel>
    (P_GS_Data_Raw / ('PH_FullDay_' + version + '_' + fuel)).mkdir(parents=True, exist_ok=True)


    # In[8]:


    def build_full_day_price_data_hours_assumed(idx_f):
        """Build and store one day's price table for the current ``fuel``.

        Reads ``files[idx_f]``, prepends each station's last record from the
        previous file (when there is one), drops consecutive rows with an
        unchanged price, makes sure every station has a row at ``h_open`` and
        at ``h_close`` (forward-filled from the preceding row, ``-1`` when
        there is none), keeps only rows inside that window and writes the
        result to ``PH_FullDay_<version>_<fuel>/<ymd>.h5`` under key ``'GS'``.

        Relies on the module-level names ``files``, ``fuel``, ``h_open``,
        ``h_close``, ``version`` and ``P_GS_Data_Raw`` as they are when the
        function is called.

        Args:
            idx_f: Position in ``files`` of the day to process.

        Returns:
            None; the table is written to disk.
        """
        # ``DataFrame.append`` was removed in pandas 2.0; ``concat`` is the
        # replacement. Imported locally so the block does not depend on how
        # pandas names were brought into the module namespace.
        from pandas import concat

        cols = ['StID', 'Time', fuel]

        # today's price
        f = files[idx_f]
        df = read_hdf(f, key='GS')[cols]
        ymd = f.name.split('.')[0].split('_')[-1]

        # opening price: yesterday's last price. The first file has no
        # predecessor, so nothing is carried over for it.
        if idx_f != 0:
            df_prev = read_hdf(files[idx_f - 1], key='GS')[cols]
            # NOTE(review): ``last()`` follows the stored row order (and takes
            # the last non-null value per column) -- assumes the file is
            # already in chronological order per station; confirm.
            df_opening = df_prev.groupby('StID').last().reset_index()
            # skip stations that do not exist today
            df_opening = df_opening[df_opening.StID.isin(df.StID.unique())]
            df = concat([df, df_opening])
        df = df.sort_values(by=['StID', 'Time']).reset_index(drop=True)

        # remove rows with same consecutive values (per station)
        prev_price = df.groupby('StID')[fuel].shift(1)
        df = df[df[fuel] != prev_price]

        # for stations without a row exactly at opening/closing time, add one
        # with a missing price; it is filled from the preceding row below.
        ymdh_open = Timestamp(ymd + ' ' + h_open)
        ymdh_close = Timestamp(ymd + ' ' + h_close)
        stids = df.StID.drop_duplicates()
        boundary_rows = []
        for ts in (ymdh_open, ymdh_close):
            has_row = stids.isin(df.loc[df.Time == ts, 'StID'])
            # Built from the StID values themselves so the column keeps its
            # dtype even when no station is missing a row.
            boundary_rows.append(
                DataFrame({'StID': stids[~has_row].to_numpy(), 'Time': ts, fuel: np.nan})
            )
        df = concat([df, *boundary_rows]).sort_values(by=['StID', 'Time']).reset_index(drop=True)

        # Fill data at opening & closing time; -1 marks "no earlier price".
        df[fuel] = df.groupby('StID')[fuel].ffill().fillna(-1).astype(int)

        # only keep rows inside the opening window (both ends inclusive)
        in_window = (df.Time >= ymdh_open) & (df.Time <= ymdh_close)
        df = df[in_window].sort_values(['StID', 'Time']).reset_index(drop=True)

        out_dir = P_GS_Data_Raw / ('PH_FullDay_' + version + '_' + fuel)
        df.to_hdf(out_dir / (ymd + '.h5'), key='GS', mode='w', complevel=9, complib='blosc')
    # build_full_day_price_data_hours_assumed(1)

    # In[11]:


    # Fan the per-day builds for the current fuel out over 6 worker processes;
    # one task per entry of `files`.
    day_indices = range(len(files))
    with Pool(6) as pool:
        ls = pool.map(build_full_day_price_data_hours_assumed, day_indices)
    #     ls = pool.map(build_full_day_price_data_hours_raw, day_indices)

